Load the library packages that may be needed for this analysis.
library(ggplot2)
library(ggmap)
library(mapdata)
## Loading required package: maps
library(maps)
library(stringr)
library(viridis)
## Loading required package: viridisLite
library(plyr)
##
## Attaching package: 'plyr'
## The following object is masked from 'package:maps':
##
## ozone
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
Proposition 2 We would like to map the major pollutants in the US states of Alabama, California, Michigan and Texas for the years 1987 and 2017, and compare them with the entire US.
Data Preparation To meet the purpose of Proposition 2, let’s pull out the chunks of data that we already used in our EDA1.Rmd file, and then free the memory held by the data frames we are no longer using.
# Columns every extract is narrowed to before analysis.
aq_columns <- c(
  "latitude",
  "longitude",
  "parameter_name",
  "method_name",
  "year",
  "arithmetic_mean",
  "arithmetic_standard_dev",
  "state_name"
)

# Read one comma-separated extract (with a header row) and return only the
# columns named in `columns`, in that order.
#
# path:    path of the file to read.
# columns: character vector of column names to keep.
#
# Stops with an explicit message when the file lacks a requested column.
# The full table only lives inside the function, so no rm() of the raw
# data frame is needed afterwards.
load_aq_extract <- function(path, columns = aq_columns) {
  raw <- read.table(path, header = TRUE, sep = ",")
  absent <- columns[!columns %in% names(raw)]
  if (length(absent) > 0) {
    stop("Missing column(s) in ", path, ": ",
         paste(absent, collapse = ", "), call. = FALSE)
  }
  raw[, columns, drop = FALSE]
}

# Dataframe ready for Analysis
eda1 <- load_aq_extract("data/cleaned_AQ.csv")

# Dataframe of Entire US
edaUS1 <- load_aq_extract("data/eda1.csv")

# Dataframe of Entire US, for Yr 1987
edaUS1987_1 <- load_aq_extract("data/eda1987.csv")

# Dataframe of Entire US, for Yr 2017
edaUS2017_1 <- load_aq_extract("data/eda2017.csv")

# Dataframe of Entire US, for both Yr of 1987 & 2017
edaUS2Yrs_1 <- load_aq_extract("data/eda2.csv")
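Let’s look at the pollutant data for the states of Michigan, California, Texas, and Alabama. Note that the filter below selects by state only; as the output shows, the resulting rows also include years other than 1987 and 2017.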
# States compared in this proposition.
states_of_interest <- c("Michigan", "California", "Texas", "Alabama")

# Keep only the rows for those states, sorted from the largest to the
# smallest arithmetic_mean. `%in%` replaces the chain of `==` comparisons;
# rows with a missing state_name are dropped either way.
eda2_States <- eda1 %>%
  filter(state_name %in% states_of_interest) %>%
  arrange(desc(arithmetic_mean))

# Preview the first rows
eda2_States %>% head()
## latitude longitude parameter_name method_name
## 1 32.72727 -117.1545 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 2 33.99958 -117.4160 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 3 33.99958 -117.4160 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 4 32.72727 -117.1545 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 5 33.12771 -117.0753 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## 6 32.79119 -116.9421 Total hydrocarbons INSTRUMENTAL - FLAME IONIZATION
## year arithmetic_mean arithmetic_standard_dev state_name
## 1 1989 28539.04 15923.682 California
## 2 1991 26747.23 7658.988 California
## 3 1989 26276.67 8485.891 California
## 4 1988 25073.71 13076.227 California
## 5 1989 22639.81 7257.409 California
## 6 1991 22370.71 5304.051 California
Let’s try to visualize the state wise distribution of the mean of the pollutants across the States of Michigan, Texas, California, and Alabama.
# Stacked histogram of arithmetic_mean on a log10 x-axis, filled by state.
# Binning is a layer parameter, not an aesthetic: the former `binwidth`
# entry inside aes() was ignored with a warning. The bin count is now set
# explicitly to 30, the value stat_bin() was already falling back to, so
# the picture is unchanged.
# NOTE(review): values that are not finite after the log10 transform
# (presumably zero or negative means) are still dropped by the scale.
ggplot(data = eda2_States) +
  geom_histogram(aes(x = arithmetic_mean,
                     fill = state_name),
                 bins = 30) +
  scale_x_log10()
## Warning: Ignoring unknown aesthetics: binwidth
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous x-axis
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 8386 rows containing non-finite values (stat_bin).
Let’s try to map out the Pollutants distribution in the 4 States of our interest as seen in the data sets from the year of 1987 and 2017.
# pollutants <- count(eda1, parameter_name) %>% filter(freq > 300)
# count(eda1, 'parameter_name') %>% summarise(freq=n()) %>% filter(freq > 10000)
# Four-state rows grouped by state
pollutants1 <- group_by(eda2_States, state_name)

# Satellite base map centred on the mean longitude/latitude of eda1
map1 <- get_map(
  location = c(lon = mean(eda1[["longitude"]]),
               lat = mean(eda1[["latitude"]])),
  maptype = "satellite",
  zoom = 4,
  scale = 2
)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=37.962973,-94.686641&zoom=4&size=640x640&scale=2&maptype=satellite&language=en-EN&sensor=false
# Plot every four-state measurement location on the base map, coloured by
# state. alpha, size and shape are constants, so they are set as fixed
# layer parameters rather than mapped inside aes(); a constant inside
# aes() is treated as data and would need its legend suppressed by hand.
# With nothing but colour mapped, the guides() overrides for fill, alpha
# and size are no longer needed.
ggmap(map1) +
  geom_point(data = eda2_States,
             aes(x = longitude,
                 y = latitude,
                 colour = state_name),
             alpha = 0.7,
             size = 5,
             shape = 21)
## Warning: Removed 2481 rows containing missing values (geom_point).
No point in keeping the same dataframe in our memory!
# Drop eda1 from the workspace
rm(list = "eda1")
Let’s figure out how frequently each Pollutant appears in the entire dataset, in the year 1987, in the year 2017, and in the dataset containing the Pollutants for both 1987 & 2017.
Because the dataframes contain a wide variety of Pollutants, we look here at how often each Pollutant occurs in these datasets. We keep only the Pollutants whose counts are greater than those of the majority of the Pollutants in the dataframe.
# Pollutants across the Entire US for 30 years
# pollutants1 <- count(edaUS1, 'parameter_name') %>% filter(freq > 5000)
# count(edaUS1, 'parameter_name')
# Row count per parameter_name in edaUS1, restricted to parameters that
# occur more than 5000 times
pollutants2 <- edaUS1 %>%
  group_by(parameter_name) %>%
  summarise(freq = n()) %>%
  filter(freq > 5000)
# Bar chart: one horizontal bar per parameter, ordered by frequency
colourCount <- length(unique(pollutants2$parameter_name))
ggplot(pollutants2,
       aes(x = reorder(parameter_name, freq),
           y = freq,
           fill = parameter_name)) +
  geom_bar(stat = "identity", position = "dodge") +
  # "Accent" provides at most 8 colours. Asking for 12 only produced a
  # warning and returned those same 8, so request 8 and let
  # colorRampPalette() interpolate up to one colour per parameter.
  scale_fill_manual(values = colorRampPalette(brewer.pal(8, "Accent"))(colourCount)) +
  scale_y_continuous(labels = scales::comma) +
  coord_flip()
## Warning in brewer.pal(12, "Accent"): n too large, allowed maximum for palette Accent is 8
## Returning the palette you asked for with that many colors
# Density of the per-parameter frequencies on a log10 x-axis
ggplot(data = pollutants2,
       mapping = aes(x = freq)) +
  geom_density(alpha = 0.90,
               fill = "paleturquoise") +
  scale_x_log10()
# Density curve drawn over a density-scaled histogram of freq (log10 x-axis)
ggplot(data = pollutants2,
       mapping = aes(x = freq, y = ..density..)) +
  geom_histogram(size = .2,
                 color = "grey60",
                 fill = "turquoise") +
  geom_density() +
  scale_x_log10()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Looks like Ozone is one of the top Pollutants, along with Sulfur dioxide, Suspended particulate (TSP), PM2.5 - Local Conditions, and PM10 Total 0-10um STP. Parameters such as Wind Speed and Outdoor Temperature also appear, as they were captured on the same day the pollutant data was measured. This gives an overview of the most frequently recorded Pollutants across the entire US, collected over a span of 30 years.
Let’s next look into the Pollutants count across US in the year of 1987.
# Pollutants across the entire US for 1987
# pollutants1987 <- count(edaUS1987_1, 'parameter_name') %>% filter(freq > 100)
# count(edaUS1987_1, 'parameter_name')
# Row count per parameter_name in the 1987 extract, restricted to
# parameters that occur more than 100 times
pollutants1987 <- edaUS1987_1 %>%
  group_by(parameter_name) %>%
  summarise(freq = n()) %>%
  filter(freq > 100)
# Horizontal bar chart of the 1987 frequencies, ordered by count
ggplot(data = pollutants1987,
       mapping = aes(x = reorder(parameter_name, freq),
                     y = freq,
                     fill = parameter_name)) +
  geom_col(position = "dodge") +
  scale_fill_brewer(palette = "Set1") +
  scale_y_continuous(labels = scales::comma) +
  coord_flip()
Looks like Sulfur dioxide, Ozone, Suspended particulate, and Carbon monoxide are the top contributing Pollutants in the US for the year 1987.
Let’s peep into our Pollutants data from the year of 2017 in the US.
# Pollutants across the entire US for 2017
# pollutants2017 <- count(edaUS2017_1, 'parameter_name') %>% filter(freq > 200)
# count(edaUS2017_1, 'parameter_name')
# Row count per parameter_name in the 2017 extract, restricted to
# parameters that occur more than 100 times
pollutants2017 <- edaUS2017_1 %>%
  group_by(parameter_name) %>%
  summarise(freq = n()) %>%
  filter(freq > 100)
# Horizontal bar chart of the 2017 frequencies, ordered by count
colourCount <- length(unique(pollutants2017$parameter_name))
ggplot(pollutants2017,
       aes(x = reorder(parameter_name, freq),
           y = freq,
           fill = parameter_name)) +
  geom_bar(stat = "identity", position = "dodge") +
  # "Dark2" provides at most 8 colours. Asking for 12 only produced a
  # warning and returned those same 8, so request 8 and let
  # colorRampPalette() interpolate up to one colour per parameter.
  scale_fill_manual(values = colorRampPalette(brewer.pal(8, "Dark2"))(colourCount)) +
  scale_y_continuous(labels = scales::comma) +
  coord_flip()
## Warning in brewer.pal(12, "Dark2"): n too large, allowed maximum for palette Dark2 is 8
## Returning the palette you asked for with that many colors
Ozone, Sulfur dioxide along with PM10 Total 0-10um STP, PM2.5- Local Conditions are some of the top parameter_name that are noted in US in the year of 2017.
It’s interesting to note that the number of distinct parameter_name values qualifying as Pollutants is larger in 2017 than in 1987. However, Ozone seemed to take second position in the top list of parameter_name.
Let’s try to look at the Pollutants data in the entire US for both 1987 and 2017.
# Pollutants across the entire US for both 1987 & 2017
# pollutants2 <- count(edaUS2Yrs_1, 'parameter_name') %>% filter(freq > 400)
# count(edaUS2Yrs_1, 'parameter_name')
# Row count per parameter_name in the combined 1987 & 2017 extract,
# restricted to parameters that occur more than 200 times
pollutants3 <- edaUS2Yrs_1 %>%
  group_by(parameter_name) %>%
  summarise(freq = n()) %>%
  filter(freq > 200)
# Horizontal bar chart of the combined 1987 & 2017 frequencies, ordered by
# count
colourCount <- length(unique(pollutants3$parameter_name))
ggplot(pollutants3,
       aes(x = reorder(parameter_name, freq),
           y = freq,
           fill = parameter_name)) +
  geom_bar(stat = "identity", position = "dodge") +
  # "Spectral" provides at most 11 colours. Asking for 12 only produced a
  # warning and returned those same 11, so request 11 and let
  # colorRampPalette() interpolate up to one colour per parameter.
  scale_fill_manual(values = colorRampPalette(brewer.pal(11, "Spectral"))(colourCount)) +
  scale_y_continuous(labels = scales::comma) +
  coord_flip()
## Warning in brewer.pal(12, "Spectral"): n too large, allowed maximum for palette Spectral is 11
## Returning the palette you asked for with that many colors
Ozone seems to top the list of the Pollutants for the entire US in both the year of 1987 and 2017. Some of the other top contributing Pollutants that are apparently present in the entire datasets for both the year of 1987 & 2017 are Sulphur dioxide, Suspended particulate (TSP) along with some other variables such as PM2.5 - Local Conditions, PM10 Total 0-10um STP etc.
Let’s try to map it out in terms of the location in US Map.
# Colour base map for the location "unitedstates", legend at the top right
map2 <- qmap(location = "unitedstates",
             zoom = 4,
             color = "color",
             legend = "topright")
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=unitedstates&zoom=4&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=unitedstates&sensor=false
## Warning: `panel.margin` is deprecated. Please use `panel.spacing` property
## instead
# Overlay 2-D bins of the four-state measurement locations on the base map,
# with both outline and fill coloured by state
map2 +
  stat_bin2d(data = eda2_States,
             mapping = aes(x = longitude,
                           y = latitude,
                           colour = state_name,
                           fill = state_name),
             bins = 30,
             size = .6,
             alpha = 1/2)
## Warning: Removed 951 rows containing non-finite values (stat_bin2d).
## Warning: Removed 1 rows containing missing values (geom_tile).
Let’s write this file so that we can use it for next leg of analysis
# Save the four-state data frame as CSV
write.csv(x = eda2_States,
          file = "data/eda2_States.csv")
Let’s clear some memory
# Remove the extracts, base maps and grouped frame from the workspace
rm(list = c("edaUS1987_1",
            "edaUS2017_1",
            "edaUS2Yrs_1",
            "edaUS1",
            "map1",
            "map2",
            "pollutants1"))
Okay, so it looks like we have now identified the Pollutants that contributed most to the data sets for the states of Alabama, California, Michigan, and Texas in both 1987 and 2017. As seen from the Map as well as from the dataframe filtering, we can conclude that Ozone is the top Pollutant contributor in these 4 States.